#
#  dest_path is the directory where you have placed the csv file
#
#
# number of bootstrap replicates
#
n_bootstrap <- 5000

#
# Packages used later in this script: ivreg() (AER), boot() (boot),
# ggplot() (ggplot2), tibble() (tibble). Loading them here makes the
# script self-contained; library() is harmless if they are already attached.
#
library(AER)      # ivreg()
library(boot)     # boot()
library(ggplot2)  # all plots below
library(tibble)   # tibble() wrapper around the bootstrap draws

# NOTE(review): absolute, user-specific path -- edit dest_path to run locally.
dest_path<-"/Users/haroldpollack/documents/SSA_456_2021/"
dest_filename<-"tutoring_RCT.csv"
complete_file_name<-paste0(dest_path,dest_filename)
v_data_frame<-read.csv(complete_file_name)
# Inspect the structure of the simulated tutoring RCT data (1500 obs, 8 vars).
str(v_data_frame)
## 'data.frame':    1500 obs. of  8 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ woman      : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ urban      : int  1 1 0 1 1 1 1 1 0 0 ...
##  $ Z          : int  1 1 0 1 0 0 0 1 1 0 ...
##  $ dose       : num  6.83 4.838 0.645 6.171 5.788 ...
##  $ score      : num  182.4 110.9 79.1 189.7 157.6 ...
##  $ score_error: num  14.075 -37.451 -27.34 16.024 -0.279 ...
##  $ dose_error : num  1.33 -0.662 -1.355 2.171 2.288 ...
#
# Baseline balance checks ----
# Z is the randomized treatment assignment. If randomization worked, baseline
# covariates should be statistically indistinguishable across the two arms.
#
paste("Check baseline balance of urbanicity.")
## [1] "Check baseline balance of urbanicity."
# Large p-value (0.86): no evidence of urban/rural imbalance across arms.
t.test(urban ~ Z, data = v_data_frame)
## 
##  Welch Two Sample t-test
## 
## data:  urban by Z
## t = -0.18093, df = 1497.9, p-value = 0.8564
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05229245  0.04346020
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6618037       0.6662198
paste("Check baseline balance of gender.")
## [1] "Check baseline balance of gender."
# Large p-value (0.72): gender is balanced across arms as well.
t.test(woman ~ Z, data = v_data_frame)
## 
##  Welch Two Sample t-test
## 
## data:  woman by Z
## t = -0.36008, df = 1497.5, p-value = 0.7188
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05483663  0.03782637
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2931034       0.3016086
#str(v_data_frame)
# NOTE(review): describe() below is not base R -- presumably psych:: or
# Hmisc::describe; confirm which package is intended before un-commenting.
#describe(v_data_frame)
paste("Regression analysis of test scores as a function of gender and urbanicity alone.")
## [1] "Regression analysis of test scores as a function of gender and urbanicity alone."
# OLS of score on the two baseline covariates only (no dose, no Z).
# Urbanicity is strongly associated with scores here; gender is not.
model_score_nodose<-lm(score ~ woman+urban, data = v_data_frame)
summary(model_score_nodose)
## 
## Call:
## lm(formula = score ~ woman + urban, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.840 -20.510  -0.117  20.943 113.618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  129.829      1.423  91.247   <2e-16 ***
## woman         -1.090      1.701  -0.641    0.522    
## urban         14.684      1.646   8.920   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.1 on 1497 degrees of freedom
## Multiple R-squared:  0.05056,    Adjusted R-squared:  0.0493 
## F-statistic: 39.86 on 2 and 1497 DF,  p-value: < 2.2e-16
paste("Regression analysis of tutoring dose as a function of gender and urbanicity alone.")
## [1] "Regression analysis of tutoring dose as a function of gender and urbanicity alone."
paste("Wow--women get much less tutoring. Urban residents get much more. This could be an important issue.")
## [1] "Wow--women get much less tutoring. Urban residents get much more. This could be an important issue."
# Dose (tutoring received) is NOT randomized: it varies strongly with gender
# and urbanicity, which is why naive regressions on dose will be biased below.
model_dose_noZ<-lm(dose ~ woman+urban, data = v_data_frame)
summary(model_dose_noZ)
## 
## Call:
## lm(formula = dose ~ woman + urban, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5072 -1.0886  0.0097  1.0266  4.2102 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.99976    0.06707   44.73   <2e-16 ***
## woman       -1.40041    0.08019  -17.46   <2e-16 ***
## urban        1.50741    0.07760   19.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.419 on 1497 degrees of freedom
## Multiple R-squared:  0.3074, Adjusted R-squared:  0.3064 
## F-statistic: 332.2 on 2 and 1497 DF,  p-value: < 2.2e-16
paste("Now add Z.")
## [1] "Now add Z."
paste("Regression analysis of tutoring dose as a function of treatment group Z alone.")
## [1] "Regression analysis of tutoring dose as a function of treatment group Z alone."
# First-stage-style check: assignment Z raises dose by ~1.97 units on average.
model_dose_Z_alone<-lm(dose ~ Z, data = v_data_frame)
summary(model_dose_Z_alone)
## 
## Call:
## lm(formula = dose ~ Z, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3211 -0.9247  0.0408  0.9491  3.8225 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.60337    0.05061   51.44   <2e-16 ***
## Z            1.97236    0.07177   27.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.39 on 1498 degrees of freedom
## Multiple R-squared:  0.3352, Adjusted R-squared:  0.3348 
## F-statistic: 755.3 on 1 and 1498 DF,  p-value: < 2.2e-16
paste("Here's a regression analysis of tutoring dose as a function of gender, urbanicity, and treatment group Z.")
## [1] "Here's a regression analysis of tutoring dose as a function of gender, urbanicity, and treatment group Z."
paste("We want a strong coefficient on Z so we can use it as an instrumental variable for dosage.")
## [1] "We want a strong coefficient on Z so we can use it as an instrumental variable for dosage."
paste("What happens to the other coefficients?")
## [1] "What happens to the other coefficients?"
# Z remains a strong predictor of dose (t = 37.65) conditional on covariates:
# a strong instrument. Gender/urbanicity coefficients barely move vs. the
# no-Z model, consistent with Z being randomized (uncorrelated with them).
model_dose_Z<-lm(dose ~ woman+urban+Z, data = v_data_frame)
summary(model_dose_Z)
## 
## Call:
## lm(formula = dose ~ woman + urban + Z, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5263 -0.6695 -0.0325  0.6511  3.2039 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.02818    0.05456   37.17   <2e-16 ***
## woman       -1.42028    0.05748  -24.71   <2e-16 ***
## urban        1.49814    0.05562   26.93   <2e-16 ***
## Z            1.97782    0.05253   37.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.017 on 1496 degrees of freedom
## Multiple R-squared:  0.6444, Adjusted R-squared:  0.6437 
## F-statistic: 903.6 on 3 and 1496 DF,  p-value: < 2.2e-16
#paste("Regression analysis of test scores as a function of gender and urbanicity, and tutoring dose.")
#paste("You might think this makes things better because dose is an important confounder. But controlling for it creates other issues. We don't know how people self-select into tutoring. So the other coefficients could still be biased. For example, the highly motivated men may be the ones with high takeup. This can upward-bias the dosage coefficient and downward-bias the male coefficient.")
#model_score<-lm(score ~ dose+woman+urban, data = v_data_frame)
#summary(model_score)

paste("Regression analysis of test scores as a function of gender, urbanicity, plus treatment group Z.")
## [1] "Regression analysis of test scores as a function of gender, urbanicity, plus treatment group Z."
paste("Note that the only directly interpretable coefficient is the coefficient on Z. Gender and urbanicity will be potentially biased if they are correlated with dose.")
## [1] "Note that the only directly interpretable coefficient is the coefficient on Z. Gender and urbanicity will be potentially biased if they are correlated with dose."
# Intent-to-treat (ITT) regression: Z coefficient (~20.55) is the causal
# effect of being ASSIGNED to treatment, regardless of dose actually taken.
model_score_Z_and_confounders<-lm(score ~ woman+urban+Z, data = v_data_frame)
summary(model_score_Z_and_confounders)
## 
## Call:
## lm(formula = score ~ woman + urban + Z, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.441 -19.808  -0.876  19.488 105.691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  119.734      1.518  78.868   <2e-16 ***
## woman         -1.296      1.599  -0.810    0.418    
## urban         14.588      1.548   9.426   <2e-16 ***
## Z             20.551      1.462  14.062   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.3 on 1496 degrees of freedom
## Multiple R-squared:  0.1614, Adjusted R-squared:  0.1597 
## F-statistic: 95.98 on 3 and 1496 DF,  p-value: < 2.2e-16
paste("Regression analysis of test scores as a function of treatment group Z alone. And look how similar the Z-coefficients are. Good randomization.")
## [1] "Regression analysis of test scores as a function of treatment group Z alone. And look how similar the Z-coefficients are. Good randomization."
# ITT without covariates: 20.605 vs 20.551 above -- near-identical because Z
# is (by randomization) essentially uncorrelated with the covariates.
model_score_Z_only<-lm(score ~ Z, data = v_data_frame)
summary(model_score_Z_only)
## 
## Call:
## lm(formula = score ~ Z, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -87.892 -20.188  -0.425  18.941  96.417 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  129.008      1.060  121.68   <2e-16 ***
## Z             20.605      1.503   13.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.11 on 1498 degrees of freedom
## Multiple R-squared:  0.1114, Adjusted R-squared:  0.1108 
## F-statistic: 187.8 on 1 and 1498 DF,  p-value: < 2.2e-16
paste("Now let's do the IV analysis of test scores as a function of gender, urbanicity, and tutoring dose. Ignore the diagnostics for now")
## [1] "Now let's do the IV analysis of test scores as a function of gender, urbanicity, and tutoring dose. Ignore the diagnostics for now"
paste("Look what happens to the coefficients on women and urbanicity--They flip.")
## [1] "Look what happens to the coefficients on women and urbanicity--They flip."
# ivreg() (AER package) 2SLS formula: outcome ~ regressors | instruments.
# dose is instrumented by Z; woman and urban instrument themselves (exogenous).
# NOTE(review): `=` assignment; the rest of the file uses `<-`.
ivreg_rct = ivreg(score~dose+woman+urban | woman+urban+Z,data=v_data_frame)
summary(ivreg_rct,diagnostics=TRUE)
## 
## Call:
## ivreg(formula = score ~ dose + woman + urban | woman + urban + 
##     Z, data = v_data_frame)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -75.9059 -15.1364   0.4962  15.2117  90.6720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  98.6589     2.0261  48.694   <2e-16 ***
## dose         10.3909     0.5783  17.968   <2e-16 ***
## woman        13.4620     1.4907   9.031   <2e-16 ***
## urban        -0.9796     1.4922  -0.656    0.512    
## 
## Diagnostic tests:
##                   df1  df2 statistic p-value    
## Weak instruments    1 1496    1417.8  <2e-16 ***
## Wu-Hausman          1 1495     159.9  <2e-16 ***
## Sargan              0   NA        NA      NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.15 on 1496 degrees of freedom
## Multiple R-Squared: 0.4864,  Adjusted R-squared: 0.4854 
## Wald test: 156.7 on 3 and 1496 DF,  p-value: < 2.2e-16
#ivreg_rct$coefficients
# Save the IV dose coefficient for comparison with the Wald estimator below.
# NOTE(review): positional indexing is fragile if the formula changes;
# ivreg_rct$coefficients["dose"] would be safer.
dose_coef<- ivreg_rct$coefficients[2]
#dose_coef
#summary(dose_coef)
#mean(dose_coef)

# Two-stage least squares ----

#
#    2sls
#
paste("Now let's do conventional two-stage least squares and see how it compares to ivreg.")
## [1] "Now let's do conventional two-stage least squares and see how it compares to ivreg."
paste("First stage equation using Z to instrument for dose")
## [1] "First stage equation using Z to instrument for dose"
# Stage 1: regress the endogenous regressor (dose) on the instrument Z plus
# exogenous covariates. Identical to model_dose_Z fitted earlier.
ols_first <- lm(dose ~ woman+urban+Z, data = v_data_frame)
summary(ols_first)
## 
## Call:
## lm(formula = dose ~ woman + urban + Z, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5263 -0.6695 -0.0325  0.6511  3.2039 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.02818    0.05456   37.17   <2e-16 ***
## woman       -1.42028    0.05748  -24.71   <2e-16 ***
## urban        1.49814    0.05562   26.93   <2e-16 ***
## Z            1.97782    0.05253   37.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.017 on 1496 degrees of freedom
## Multiple R-squared:  0.6444, Adjusted R-squared:  0.6437 
## F-statistic: 903.6 on 3 and 1496 DF,  p-value: < 2.2e-16
# Predicted (exogenous) component of dose, used in place of dose in stage 2.
v_data_frame$dose_hat <- fitted(ols_first)
paste("Second stage equation using predicted dose from the first stage")
## [1] "Second stage equation using predicted dose from the first stage"
ols_second <- lm(score ~ dose_hat+woman+urban, data = v_data_frame)
summary(ols_second)
## 
## Call:
## lm(formula = score ~ dose_hat + woman + urban, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.441 -19.808  -0.876  19.488 105.691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  98.6589     2.5890  38.107  < 2e-16 ***
## dose_hat     10.3909     0.7390  14.062  < 2e-16 ***
## woman        13.4620     1.9049   7.067 2.42e-12 ***
## urban        -0.9796     1.9068  -0.514    0.608    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.3 on 1496 degrees of freedom
## Multiple R-squared:  0.1614, Adjusted R-squared:  0.1597 
## F-statistic: 95.98 on 3 and 1496 DF,  p-value: < 2.2e-16
paste("Let's run the analogous regression uninstrumented relationships.")
## [1] "Let's run the analogous regression uninstrumented relationships."
paste("Regression analysis of test scores as a function of gender and urbanicity, and uninstrumented tutoring dose.")
## [1] "Regression analysis of test scores as a function of gender and urbanicity, and uninstrumented tutoring dose."
paste("Everything will be effed-up by selection bias. ")
## [1] "Everything will be effed-up by selection bias. "
paste("You might think this makes things better because dose is an important confounder. Nope. Everything is effed-up by selection bias. We thnk we are helping by including dose in the model. But we don't know how people self-select into tutoring. So the other coefficients are biased.")
## [1] "You might think this makes things better because dose is an important confounder. Nope. Everything is effed-up by selection bias. We thnk we are helping by including dose in the model. But we don't know how people self-select into tutoring. So the other coefficients are biased."
paste("Dose coefficient is way upward biased.")
## [1] "Dose coefficient is way upward biased."
paste("This leads the woman coefficient to be way upward biased to fit the women's data, sinc most women weren't tutored.")
## [1] "This leads the woman coefficient to be way upward biased to fit the women's data, sinc most women weren't tutored."
paste("Urban coefficient is way downward biased to fit the data there.")
## [1] "Urban coefficient is way downward biased to fit the data there."
ols_unstrumented <- lm(score ~ dose+woman+urban, data = v_data_frame)
summary(ols_unstrumented)
## 
## Call:
## lm(formula = score ~ dose + woman + urban, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.167 -14.059   0.428  14.774  80.192 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  84.4226     1.5242   55.39  < 2e-16 ***
## dose         15.1367     0.3843   39.39  < 2e-16 ***
## woman        20.1081     1.3080   15.37  < 2e-16 ***
## urban        -8.1334     1.2910   -6.30  3.9e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.1 on 1496 degrees of freedom
## Multiple R-squared:  0.5339, Adjusted R-squared:  0.533 
## F-statistic: 571.3 on 3 and 1496 DF,  p-value: < 2.2e-16
# Fitted values from the naive (selection-biased) OLS, for plotting.
v_data_frame$score_hat <- fitted(ols_unstrumented)
#ggplot(v_data_frame,aes(dose,score_hat)+geom_point())
#
#    Note to self: Go back and include the score_error term
#
# Score (dark red) and score_error (yellow) vs dose, with the naive OLS fit.
# `size` for line geoms is deprecated since ggplot2 3.4.0; use `linewidth`.
ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_point(aes(y = score_error), color = "yellow") +
  geom_line(aes(y = score_hat), color = "steelblue", linewidth = 2)

# Same plot stratified by the raw 0/1 indicators (factors are created later).
ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_point(aes(y = score_error), color = "yellow") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_grid(urban ~ woman)

# Labelled factor versions of the binary indicators, for facet strips and
# readable regression output. Level 0 always maps to the first label.
v_data_frame$F_urban <- factor(v_data_frame$urban, levels = 0:1,
                               labels = c("non-urban", "urban"))
v_data_frame$F_woman <- factor(v_data_frame$woman, levels = 0:1,
                               labels = c("Other gender", "Woman"))
v_data_frame$F_Z <- factor(v_data_frame$Z, levels = 0:1,
                           labels = c("Control group", "Treatment group"))
# Four-cell gender-by-urbanicity indicator: urban + 2*woman codes 0..3.
v_data_frame$F_urban_woman <- factor(v_data_frame$urban + 2 * v_data_frame$woman,
                                     levels = 0:3,
                                     labels = c("Rural male", "urban male",
                                                "rural woman", "urban woman"))
# Regress the (simulated) score noise on dose and the four-cell factor.
# A nonzero dose coefficient here shows the error term is correlated with
# dose -- the source of the selection bias in the naive OLS above.
ols_score_error <- lm(score_error ~ dose+F_urban_woman, data = v_data_frame)
summary(ols_score_error)
## 
## Call:
## lm(formula = score_error ~ dose + F_urban_woman, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.522 -13.926   0.367  14.701  80.255 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -15.02578    1.58425  -9.484  < 2e-16 ***
## dose                       5.04934    0.38380  13.156  < 2e-16 ***
## F_urban_womanurban male   -8.41777    1.49259  -5.640 2.03e-08 ***
## F_urban_womanrural woman   7.82581    2.14396   3.650 0.000271 ***
## F_urban_womanurban woman   0.03495    1.63625   0.021 0.982959    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.06 on 1495 degrees of freedom
## Multiple R-squared:  0.1045, Adjusted R-squared:  0.1021 
## F-statistic: 43.59 on 4 and 1495 DF,  p-value: < 2.2e-16
v_data_frame$score_error_hat <- fitted(ols_score_error)
# Cell means of score by the four gender/urbanicity groups (rural male is
# the reference category / intercept).
ols_group_score <- lm(score ~ F_urban_woman, data = v_data_frame)
summary(ols_group_score)
## 
## Call:
## lm(formula = score ~ F_urban_woman, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -98.365 -20.564  -0.145  20.946 112.590 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               129.430      1.580  81.901  < 2e-16 ***
## F_urban_womanurban male    15.293      1.952   7.835 8.82e-15 ***
## F_urban_womanrural woman    0.337      2.988   0.113     0.91    
## F_urban_womanurban woman   13.518      2.339   5.780 9.07e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.11 on 1496 degrees of freedom
## Multiple R-squared:  0.05078,    Adjusted R-squared:  0.04887 
## F-statistic: 26.68 on 3 and 1496 DF,  p-value: < 2.2e-16
v_data_frame$group_score_hat <- fitted(ols_group_score)

# NOTE(review): this refits exactly the same model as ols_score_error above
# (identical formula and data), overwriting that object with an identical
# fit. Redundant but harmless; retained as-is from the original handout.
ols_score_error <- lm(score_error ~ dose+F_urban_woman, data = v_data_frame)
summary(ols_score_error)
## 
## Call:
## lm(formula = score_error ~ dose + F_urban_woman, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.522 -13.926   0.367  14.701  80.255 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -15.02578    1.58425  -9.484  < 2e-16 ***
## dose                       5.04934    0.38380  13.156  < 2e-16 ***
## F_urban_womanurban male   -8.41777    1.49259  -5.640 2.03e-08 ***
## F_urban_womanrural woman   7.82581    2.14396   3.650 0.000271 ***
## F_urban_womanurban woman   0.03495    1.63625   0.021 0.982959    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.06 on 1495 degrees of freedom
## Multiple R-squared:  0.1045, Adjusted R-squared:  0.1021 
## F-statistic: 43.59 on 4 and 1495 DF,  p-value: < 2.2e-16
# Fitted error plus the group mean score: puts the fitted residual pattern
# back on the score scale for the plots below.
v_data_frame$meaned_score_error_hat <- fitted(ols_score_error)+v_data_frame$group_score_hat


# 
# interaction
#
# Hand-built interaction regressors: dose x Z, plus the three-way terms
# with urban and woman used in the fuller model below.
v_data_frame$dose_Z<-v_data_frame$dose*v_data_frame$Z
v_data_frame$dose_Z_urban<-v_data_frame$dose*v_data_frame$Z*v_data_frame$urban
v_data_frame$dose_Z_woman<-v_data_frame$dose*v_data_frame$Z*v_data_frame$woman
# Does the dose-score slope differ for treatment-group tutoring (dose_Z)?
ols_interaction <- lm(score ~ dose+urban+woman+dose_Z, data = v_data_frame)
summary(ols_interaction)
## 
## Call:
## lm(formula = score ~ dose + urban + woman + dose_Z, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.656 -13.217   0.174  13.503  78.123 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  75.1264     1.6705   44.97   <2e-16 ***
## dose         20.8850     0.6213   33.62   <2e-16 ***
## urban       -13.3313     1.3179  -10.12   <2e-16 ***
## woman        25.3264     1.3339   18.99   <2e-16 ***
## dose_Z       -4.1340     0.3598  -11.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.23 on 1495 degrees of freedom
## Multiple R-squared:  0.5718, Adjusted R-squared:  0.5706 
## F-statistic:   499 on 4 and 1495 DF,  p-value: < 2.2e-16
# Interaction of dose*Z with the four-cell group. The original code multiplied
# the factor F_urban_woman directly, which is not defined for factors ('*' not
# meaningful for factors -- it warned and produced NA). Use the underlying
# 0..3 numeric coding instead (factor codes are 1..4, so subtract 1).
v_data_frame$dose_factors <- v_data_frame$dose * v_data_frame$Z *
  (as.integer(v_data_frame$F_urban_woman) - 1L)
# Full interaction model: separate dose_Z shifts by gender and urbanicity.
# Neither three-way term is significant at the 5% level.
ols_interaction_full <- lm(score ~ dose+urban+woman+dose_Z+dose_Z_woman+dose_Z_urban, data = v_data_frame)
summary(ols_interaction_full)
## 
## Call:
## lm(formula = score ~ dose + urban + woman + dose_Z + dose_Z_woman + 
##     dose_Z_urban, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.678 -13.526   0.018  13.544  80.829 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   75.2011     1.8148  41.437   <2e-16 ***
## dose          21.2427     0.6387  33.258   <2e-16 ***
## urban        -15.4597     1.7263  -8.955   <2e-16 ***
## woman         27.4290     1.7766  15.439   <2e-16 ***
## dose_Z        -4.8644     0.5845  -8.322   <2e-16 ***
## dose_Z_woman  -0.9098     0.5404  -1.684   0.0925 .  
## dose_Z_urban   0.9346     0.5265   1.775   0.0761 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.21 on 1493 degrees of freedom
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5717 
## F-statistic: 334.4 on 6 and 1493 DF,  p-value: < 2.2e-16
v_data_frame$score_interaction_hat <- fitted(ols_interaction_full)

# Simplest naive model: score on dose alone (no covariates, no instrument).
ols_score_dose_only <- lm(score ~ dose, data = v_data_frame)
summary(ols_score_dose_only)
## 
## Call:
## lm(formula = score ~ dose, data = v_data_frame)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.134 -15.801  -0.196  15.360 100.366 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  95.3441     1.3688   69.66   <2e-16 ***
## dose         12.2510     0.3449   35.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.75 on 1498 degrees of freedom
## Multiple R-squared:  0.4572, Adjusted R-squared:  0.4568 
## F-statistic:  1262 on 1 and 1498 DF,  p-value: < 2.2e-16
v_data_frame$score_dose_only_hat <- fitted(ols_score_dose_only)
# `size` for line geoms is deprecated since ggplot2 3.4.0; use `linewidth`.
p_dose_only <- ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_line(aes(y = score_dose_only_hat), color = "steelblue",
            linetype = "twodash", linewidth = 2)
p_dose_only_facets <- ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_line(aes(y = score_dose_only_hat), color = "steelblue",
            linetype = "twodash", linewidth = 2) +
  facet_grid(F_urban ~ F_woman)
p_dose_only+ggtitle("Regressing score on dose")

p_dose_only_facets+ggtitle("Regressing score on dose--stratified by gender and urbanicity")

#
#    Plots
#
# Gallery of score-vs-dose plots with the naive OLS fit (score_hat),
# stratified various ways by the labelled factors created above.
# `size` for line geoms is deprecated since ggplot2 3.4.0; use `linewidth`.
ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_grid(F_urban ~ F_woman)

ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_grid(F_urban ~ F_Z)

ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_grid(F_woman ~ F_Z)

# Add the raw score_error points (yellow) by four-cell group and arm.
ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_point(aes(y = score_error), color = "yellow") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_grid(F_urban_woman ~ F_Z)

# Add the re-meaned fitted residuals (red) on top.
ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_point(aes(y = score_error), color = "yellow") +
  geom_point(aes(y = meaned_score_error_hat), color = "red") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_grid(F_urban_woman ~ F_Z)

ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  #geom_point(aes(y = score_error), color = "yellow") +
  geom_point(aes(y = meaned_score_error_hat), color = "red") +
  labs(title="Test scores (dots) vs. dose. Red line is fitted residual vs. dose.") +
  facet_grid(F_urban_woman ~ F_Z)

ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  #geom_point(aes(y = score_error), color = "yellow") +
  geom_point(aes(y = score_interaction_hat), color = "red") +
  labs(title="Test scores (dots) vs. dose. Red line is fitted score in full interaction model vs. dose.") +
  facet_grid(F_urban_woman ~ F_Z)

ggplot(v_data_frame, aes(x = dose)) +
  geom_point(aes(y = score), color = "darkred") +
  geom_line(aes(y = score_hat), color = "steelblue", linetype = "twodash",
            linewidth = 2) +
  facet_wrap(~F_Z)

# Bootstrapping ----

paste("Now let's do the Wald estimator, with and without bootstrapping.")
## [1] "Now let's do the Wald estimator, with and without bootstrapping."
# Group means of dose (v1) and score (v2) by treatment assignment Z.
# aggregate() orders groups by sorted Z, so row 1 is Z=0, row 2 is Z=1.
v1<- aggregate(x = v_data_frame$dose,    
          by = list(v_data_frame$Z),        
          FUN = mean)    

v2<- aggregate(x = v_data_frame$score,     
          by = list(v_data_frame$Z),  
          FUN = mean)                 
# Wald/IV ratio: (difference in mean score) / (difference in mean dose),
# i.e. the ITT effect rescaled by the first-stage effect of Z on dose.
paste("Here's the Wald estimator of the value of tutoring (per dose)",(v2[2,2]-v2[1,2])/(v1[2,2]-v1[1,2]))
## [1] "Here's the Wald estimator of the value of tutoring (per dose) 10.4467522660955"
#
# Bootstrap statistic for the Wald estimator.
# boot() calls this with the data and a vector of resample indices; it
# returns the Wald ratio (score-gap over dose-gap across arms Z=0 vs Z=1)
# computed on the resampled rows.
Wald_ratio <- function(data, indices) {
  resample <- data[indices, ]
  # Column 2 of aggregate()'s result holds the per-arm means, ordered Z=0, Z=1.
  arm_means <- function(v) {
    aggregate(x = v, by = list(resample$Z), FUN = mean)[, 2]
  }
  dose_means <- arm_means(resample$dose)
  score_means <- arm_means(resample$score)
  diff(score_means) / diff(dose_means)
}
#
# bootstrapping with B replications
#
# results$t is an n_bootstrap x 1 matrix of Wald-ratio replicates.
results <- boot(data=v_data_frame, statistic=Wald_ratio,
   R=n_bootstrap)
results
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = v_data_frame, statistic = Wald_ratio, R = n_bootstrap)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 10.44675 -0.01261882   0.6101649
paste0("Bootstrap histogram of Wald estimator for TOT treatment effect, qplot to test normality w/",n_bootstrap," replications. Yup-simulated normal data looks normal.") 
## [1] "Bootstrap histogram of Wald estimator for TOT treatment effect, qplot to test normality w/5000 replications. Yup-simulated normal data looks normal."
# plot() on a boot object draws a histogram and a normal Q-Q plot of t*.
plot(results)

#
#     95% CI and median
#
# Manual percentile interval from order statistics of the sorted replicates.
# NOTE(review): boot.ci(results, type = "perc") is the standard alternative
# and would handle interpolation of the quantiles; with n_bootstrap even,
# the "median" below is the 2500th order statistic, not the averaged median.
bs_estimates<-tibble(results$t)
LB<-round(0.025*n_bootstrap)
med<-round(0.50*n_bootstrap)
UB<-round(0.975*n_bootstrap)
bs_estimates<-bs_estimates[order(results$t),]
paste0("95% confidence interval of Wald estimate of TOT treatment effect: (", round(bs_estimates[LB,],digits=2)," , ",round(bs_estimates[UB,],digits=2),")") 
## [1] "95% confidence interval of Wald estimate of TOT treatment effect: (9.25 , 11.6)"
paste0("Median Wald estimate of TOT treatment effect: ", round(bs_estimates[med,],digits=2)," , Almost identical IVREG coefficient ", round(dose_coef,digits=2)) 
## [1] "Median Wald estimate of TOT treatment effect: 10.44 , Almost identical IVREG coefficient 10.39"
paste("I was rather surprised by how similar the Wald estimator was to the TOT. The beauty of large-N simulated data that follows the normal distribution.") 
## [1] "I was rather surprised by how similar the Wald estimator was to the TOT. The beauty of large-N simulated data that follows the normal distribution."

# Now let's plot some densities ----